library(tidyverse)  # ggplot(), %>%, mutate(), and friends 
library(broom)
library(MatchIt)  # Match things
library(Rcpp)
library(MASS)
library(modelsummary)
library(IRdisplay)
library(haven)
library(sjmisc)
library(dplyr)
library(stringr)

# Working directory ----
# Report the current working directory, then switch to the project folder so
# that relative file paths resolve against it.
getwd()
setwd("C:/Users/Gian Maria/Desktop/Unitn/map/matching")

# Input data ----
# df: read relative to the working directory set above (the "MAP" source).
# wp: read from an absolute path (the "WP" source).
df <- read_csv(file = "SHR76_20.csv")
wp <- read_csv(file = "C:/Users/Gian Maria/Desktop/Unitn/map/WashingtonPost/data-homicides-master/homicide-data-geocodio-full.csv")




# Processing MAP ----

# One pass over df:
#   1. keep rows with 2006 < Year < 2018, i.e. 2007 through 2017;
#   2. rename Agency / VicAge / VicSex to City / Victim_Age / Victim_Sex;
#   3. build the two composite keys
#        Code   = Year-Month-City-Victim_Age-Victim_Sex
#        ID_age = ID-Victim_Age
df <- df %>%
  filter(Year > 2006 & Year < 2018) %>%
  dplyr::rename(City = Agency, Victim_Age = VicAge, Victim_Sex = VicSex) %>%
  dplyr::mutate(
    Code = str_c(Year, "-", Month, "-", City, "-", Victim_Age, "-", Victim_Sex),
    ID_age = str_c(ID, "-", Victim_Age)
  )

# Number of rows whose Code repeats that of an earlier row
sum(duplicated(df$Code))
# Processing WP ----

# reported_date is sliced by position: characters 1-4 are taken as the year
# and characters 5-6 as the month number.
wp <- wp %>%
  dplyr::mutate(
    Year = str_sub(reported_date, 1, 4),
    Month = str_sub(reported_date, 5, 6)
  )

# Lookup table "01".."12" -> "January".."December", used to translate the
# month number into the month name.
vec <- setNames(month.name, sprintf("%02d", 1:12))
wp$string_month <- vec[match(wp$Month, names(vec))]

# Recode the outcome: open cases become "No", both kinds of closed case
# become "Yes"; any other value (including NA) is left untouched.
wp$disposition[wp$disposition %in% "Open/No arrest"] <- "No"
wp$disposition[wp$disposition %in% c("Closed by arrest", "Closed without arrest")] <- "Yes"

# Rename columns and build the linking key. The existing `City` column has to
# be moved to `City_Geoc` before `city` can take over the name `City`, so
# those two renames stay separate, ordered steps.
wp <- wp %>%
  dplyr::rename(Solved = disposition) %>%
  dplyr::rename(City_Geoc = City) %>%
  dplyr::rename(City = city) %>%
  dplyr::rename(Victim_Age = victim_age, Victim_Sex = victim_sex) %>%
  dplyr::mutate(
    Code = str_c(Year, "-", string_month, "-", City, "-", Victim_Age, "-", Victim_Sex)
  )

# Number of rows whose Code repeats that of an earlier row
sum(duplicated(wp$Code))

# Percentage of households with income of $100,000 or more ----
# Row-wise sum of the four ACS household-income percentage columns from
# $100,000 upward. No na.rm is used, so a missing value in any of the four
# columns makes the sum NA for that row.
wp$ACS_Income_Higher100 <- wp %>%
  dplyr::select(all_of(c(
    "ACS Economics/Household income/$100,000 to $124,999/Percentage",
    "ACS Economics/Household income/$125,000 to $149,999/Percentage",
    "ACS Economics/Household income/$150,000 to $199,999/Percentage",
    "ACS Economics/Household income/$200,000 or more/Percentage"
  ))) %>%
  rowSums()
# Reduced data frames ----

# WP side: linking key, outcome and the two ACS columns, tagged with its source
wpR <- wp %>%
  dplyr::select(all_of(c(
    "Code",
    "Solved",
    "ACS_Income_Higher100",
    "ACS Demographics/Race and ethnicity/Not Hispanic or Latino: Black or African American alone/Percentage"
  )))
wpR$Source <- "WP"

# MAP side: identifiers, linking key and outcome, tagged with its source
dfR <- df %>%
  dplyr::select(all_of(c("ID", "ID_age", "Code", "Solved")))
dfR$Source <- "MAP"

# Inner join of WP and MAP on Code. Columns present on both sides (Solved,
# Source) come out with the suffixes .x (WP) and .y (MAP).
tR2 <- merge(wpR, dfR, by = "Code")


# Find and remove duplicated rows: when a Code occurs more than once we cannot
# tell which record is which, so the match would be meaningless or possibly
# wrong. (This step could equally be done before merging.)
#
# This explanation used to be written as a '''...''' block, which R does not
# support: it is read as several adjacent string constants and is a syntax
# error that prevents the script from being parsed. It is now a plain comment.
#
# NOTE(review): distinct() keeps the FIRST row of every duplicated Code; it
# does not drop all rows sharing an ambiguous Code. Confirm this is intended.
tR2 <- tR2 %>% distinct(Code, .keep_all = TRUE)

# Numeric "Solved" indicators for both sources in the de-duplicated data:
# 1 when the source says "Yes", 0 for any other value, NA when it is missing.
# Solved.x feeds the WP indicator and Solved.y the MAP indicator.
tR2$Num_SolvWP <- ifelse(tR2$Solved.x == "Yes", 1, 0)
tR2$Num_SolvMAP <- ifelse(tR2$Solved.y == "Yes", 1, 0)

# WP minus MAP, to see how many outcomes agree across the two sources:
#    0 -> both sources agree
#    1 -> solved according to WP, not according to MAP
#   -1 -> solved according to MAP, not according to WP
tR2$difference <- tR2$Num_SolvWP - tR2$Num_SolvMAP



# Two versions follow ----
#   (1) keep only the records whose outcome is equal in both sources;
#   (2) keep the equal records and switch the discordant ones.

# Version 1: rows where the two sources agree (difference == 0). Only ID,
# Code and the two ACS columns are carried over; the outcome itself is not
# needed from here because it is the same in both sources.
tR3 <- tR2 %>%
  filter(difference == 0) %>%
  dplyr::select(all_of(c(
    "ID",
    "Code",
    "ACS_Income_Higher100",
    "ACS Demographics/Race and ethnicity/Not Hispanic or Latino: Black or African American alone/Percentage"
  )))

# MAP data reduced to one row per Code (the first occurrence is kept), so the
# merge below cannot multiply rows.
df_nodup <- df %>%
  distinct(Code, .keep_all = TRUE)

# Attach the full MAP records to the agreeing matches
equal_map <- merge(tR3, df_nodup, by = "Code")

# Version 2: keep every matched record (agreeing and discordant) and carry
# `difference` along so that the discordant outcomes can be switched below.
tR4 <- dplyr::select(tR2, c(
  "ID", "Code", "difference",
  "ACS_Income_Higher100",
  "ACS Demographics/Race and ethnicity/Not Hispanic or Latino: Black or African American alone/Percentage"
))

# Attach the full MAP records
discordant_map <- merge(tR4, df_nodup, by = "Code")

# Overwrite the MAP outcome with the WP outcome wherever the sources disagree.
# `difference` is WP minus MAP, so -1 means WP says "No" and 1 means WP says
# "Yes".
#
# The column is indexed directly through which() instead of assigning via
# discordant_map[cond, ]$Solved: that form stops with "replacement has 1 row,
# data has 0" when no row satisfies the condition, and fails on NA values in
# `difference`. which() yields an empty index when nothing matches and skips
# NA comparisons, so such rows are simply left unchanged.
wp_says_no <- which(discordant_map$difference == -1)
wp_says_yes <- which(discordant_map$difference == 1)
discordant_map$Solved[wp_says_no] <- "No"
discordant_map$Solved[wp_says_yes] <- "Yes"

# Save data frames ----
# Both files are written relative to the current working directory.
# write.csv() defaults are kept, so row names are written as the first column.
write.csv(
  equal_map,
  file = "robustness_matching_equal_wp_map_dataset_with_ses.csv"
)
write.csv(
  discordant_map,
  file = "robustness_matching_discordant_wp_map_dataset_with_ses.csv"
)

